################################################################################
# FILE 6. DISTRIBUTION OF SEAT ACCURACY
################################################################################
# PURPOSE:
#   1) Load and harmonize Flanders survey datasets (citizen + politician, 2023/2024).
#   2) Re-scale the seats-accuracy outcome to the intended 0–7 scale.
#   3) Produce a publication-quality violin + boxplot figure comparing the
#      distribution of seat-accuracy by:
#        - survey wave (2023, 2024, Both)
#        - respondent type (citizen vs politician)
#   4) Fit baseline regression models predicting seat accuracy.
#
# INPUTS (RDS):
#   - cf_flanders_2023_data.rds   (citizen Flanders 2023)
#   - cf_flanders_2024_data.rds   (citizen Flanders 2024)
#   - cf_flanders_data.rds        (citizen pooled)
#   - pf_flanders_2023_data.rds   (politician Flanders 2023)
#   - pf_flanders_2024_data.rds   (politician Flanders 2024)
#   - pf_flanders_data.rds        (politician pooled)
#   - flanders_data.rds           (merged / analysis dataset used for plots/models)
#
# OUTPUTS:
#   - seats_violon.png           (10x7 inches @ 1200 dpi)
#
# KEY VARIABLES (expected in `flanders.data`):
#   - seats_accuracy : numeric outcome (target scale 0–7)
#   - type           : group indicator (0=citizens, 1=politicians)
#   - survey         : wave indicator ("2023", "2024") used for grouping
#   - seats_winner   : predictor (winner / correctness indicator)
#   - sex, age       : controls
################################################################################

################################################################################
# 1. WORKING DIRECTORY + FILE LOCATIONS
################################################################################

getwd()

# Set project directory (update it to your own directory that contains the input files)
setwd("updated_path_here")

# Centralize file names so changes are one-line edits
out_plot_seats <- "seats_violon.png"

################################################################################
# 2. LOAD DATASETS
################################################################################

# ---- Citizen datasets ----
cf.flanders.2023.data <- readRDS("cf_flanders_2023_data.rds")
cf.flanders.2024.data <- readRDS("cf_flanders_2024_data.rds")
cf.flanders.data      <- readRDS("cf_flanders_data.rds")       # pooled citizens

# ---- Politician datasets ----
pf.flanders.2023.data <- readRDS("pf_flanders_2023_data.rds")
pf.flanders.2024.data <- readRDS("pf_flanders_2024_data.rds")
pf.flanders.data      <- readRDS("pf_flanders_data.rds")       # pooled politicians

# ---- Merged / analysis dataset ----
flanders.data <- readRDS("flanders_data.rds")


################################################################################
# 3. OUTCOME PREP: RESCALE SEATS ACCURACY
################################################################################
# The original seats_accuracy scale is 1–8; it should be 0–7.
# Subtracting 1 shifts the scale while preserving ordering and differences.

flanders.data$seats_accuracy <- flanders.data$seats_accuracy - 1

################################################################################
# 4. DESCRIPTIVES: CITIZENS VS POLITICIANS (OVERALL DISTRIBUTIONS)
################################################################################
# Goal: Compare distributions by (survey wave × respondent type), and add a
# combined "Both" group that pools across 2023/2024 within type.

# ---- 4.1 Create "Both" group by duplicating rows and relabeling survey ----
# This produces three survey labels:
#   - "2023" : original 2023 rows
#   - "2024" : original 2024 rows
#   - "Both" : duplicated rows pooling across both waves
flanders.data.combined <- dplyr::bind_rows(
  flanders.data,
  dplyr::mutate(flanders.data, survey = "Both")
)

# ---- 4.2 Remove non-finite outcomes ----
# Defensive cleaning: removes Inf/-Inf and also implicitly filters out NaN.
# (NA is not finite either, so NA rows drop here as well.)
flanders.data.combined <- dplyr::filter(
  flanders.data.combined,
  is.finite(seats_accuracy)
)

# ---- 4.3 Set factor levels for stable ordering in plots ----
# Ensures x-axis is ordered: 2023, 2024, Both (rather than alphabetical)
flanders.data.combined$survey <- factor(
  flanders.data.combined$survey,
  levels = c("2023", "2024", "Both")
)

# ---- 4.4 Compute mean labels (for annotation on the figure) ----
# Means are computed within each (survey × type) cell.
mean_labels <- flanders.data.combined %>%
  dplyr::group_by(survey, type) %>%
  dplyr::summarise(
    avg = mean(seats_accuracy, na.rm = TRUE),
    .groups = "drop"
  )

################################################################################
# 5. FIGURE: VIOLIN + BOXPLOT + MEAN + MEAN LABELS
################################################################################
# Plot components:
#   - Violin: full distribution shape
#   - Boxplot: median/IQR + outliers
#   - Mean point: visual anchor for average
#   - Mean text labels: numeric mean printed near the mean point
#
# Aesthetics:
#   - x: survey wave (2023/2024/Both)
#   - y: seat accuracy (0–7; display 0–8 to keep headroom)
#   - fill: type (citizens vs politicians)

png(out_plot_seats, units = "in", width = 10, height = 7, res = 1200)

ggplot(flanders.data.combined, aes(x = survey, y = seats_accuracy, fill = factor(type))) +
  
  # ---- 5.1 Distribution shape (violin) ----
# trim = FALSE keeps full kernel density tails
# position_dodge aligns citizens vs politicians side-by-side within each wave
geom_violin(
  trim     = FALSE,
  alpha    = 0.6,
  position = position_dodge(width = 0.8),
  color    = NA
) +
  
  # ---- 5.2 Central tendency & spread (boxplot) ----
# Width kept narrow to sit "inside" the violin.
geom_boxplot(
  width         = 0.1,
  position      = position_dodge(width = 0.8),
  outlier.size  = 0.8,
  outlier.alpha = 0.5,
  alpha         = 0.4
) +
  
  # ---- 5.3 Mean marker (black dot) ----
# stat_summary calculates mean within each group and draws a point.
stat_summary(
  fun      = mean,
  geom     = "point",
  shape    = 20,
  size     = 2,
  color    = "black",
  position = position_dodge(width = 0.8)
) +
  
  # ---- 5.4 Mean labels (text near mean point) ----
# The x = as.numeric(survey) + 0.125 "nudges" labels horizontally so they
# don’t collide with the mean dot and/or dodge positions.
geom_text(
  data = mean_labels,
  aes(
    x     = as.numeric(survey) + 0.125,
    y     = avg,
    label = sprintf("%.2f", avg),
    group = factor(type),
    fill  = NULL
  ),
  position = position_dodge(width = 0.8),
  vjust    = -0.5,
  size     = 5,
  color    = "black"
) +
  
  # ---- 5.5 Axes ----
# limits slightly above 7 so labels have headroom
scale_y_continuous(
  limits = c(0, 8),
  breaks = 0:7,
  expand = expansion(mult = c(0, 0.05))
) +
  
  # ---- 5.6 Fill legend mapping ----
# type is encoded as "0" and "1" in your data; relabel for readability.
scale_fill_manual(
  values = c("0" = "#1f77b4", "1" = "#ff7f0e"),
  labels = c("0" = "Citizens", "1" = "Politicians"),
  name   = NULL
) +
  
  # ---- 5.7 Labels ----
labs(
  x = "Survey Year",
  y = "Seat Accuracy"
) +
  
  # ---- 5.8 Theme ----
theme_bw() +
  theme(
    legend.position  = "bottom",
    legend.key.size  = unit(15, "pt"),
    legend.text      = element_text(size = 17),
    legend.margin    = margin(t = 10, b = 0),
    axis.title.x     = element_text(size = 19, margin = margin(t = 10)),
    axis.title.y     = element_text(size = 19, margin = margin(r = 10)),
    axis.text.x      = element_text(size = 17),
    axis.text.y      = element_text(size = 17),
    plot.title       = element_blank()
  )

dev.off()

# =============================================================================
# END OF SCRIPT
# =============================================================================